Time Series Analysis

Introduction

This document contains time series analysis for the DSAN5100 final project.

Load data files and preprocess

# Load required libraries for R
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(arrow)

Attaching package: 'arrow'

The following object is masked from 'package:lubridate':

    duration

The following object is masked from 'package:utils':

    timestamp
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:arrow':

    schema

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(zoo)

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(forecast)  
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
# Set Arrow option to skip null characters
options(arrow.skip_nul = TRUE)

# Set seed for reproducibility
set.seed(42)
# Load the data

argentina_df <- read_parquet("data/argentina_data_final.parquet")
canada_df <- read_parquet("data/canada_data_final.parquet")
china_df <- read_parquet("data/china_data_final.parquet")
india_df <- read_parquet("data/india_data_final.parquet")
italy_df <- read_parquet("data/italy_data_final.parquet")
russia_df <- read_parquet("data/russia_data_final.parquet")
us_df <- read_parquet("data/us_data_final.parquet")


head(us_df)
# A tibble: 6 × 16
     ID publishedAt    `source-name` location_code location category  year month
  <int> <chr>          <chr>         <chr>         <chr>    <chr>    <int> <int>
1   816 2020-08-07T11… Minneapolis … us            United … general   2020     8
2   901 2020-08-07T11… Google News   us            United … general   2020     8
3   961 2020-08-07T11… CNBC          us            United … general   2020     8
4  1054 2020-08-07T12… Google News   us            United … general   2020     8
5  1147 2020-08-07T12… CNET          us            United … general   2020     8
6  1335 2020-08-07T13… The Washingt… us            United … general   2020     8
# ℹ 8 more variables: new_title <chr>, neg <dbl>, neu <dbl>, pos <dbl>,
#   compound <dbl>, sentiment_category <chr>, bias_category <chr>,
#   bias_score <dbl>
# Change publishedAt column into a date type

# Helper: coerce the `publishedAt` column of a country news frame to Date.
# Accepts any data frame with a `publishedAt` column (here an ISO-8601
# character timestamp); as.Date() keeps only the calendar-date part.
convert_date_column <- function(df) {
  df$publishedAt <- as.Date(df$publishedAt)
  df
}

# Apply date conversion to all dataframes
# All seven country frames share the same 16-column schema, so the same
# helper applies to each.
argentina_df <- convert_date_column(argentina_df)
canada_df <- convert_date_column(canada_df)
china_df <- convert_date_column(china_df)
india_df <- convert_date_column(india_df)
italy_df <- convert_date_column(italy_df)
russia_df <- convert_date_column(russia_df)
us_df <- convert_date_column(us_df)

# Check the data type conversion
cat("Date type conversion completed:\n")
Date type conversion completed:
cat("publishedAt column type for US data:", class(us_df$publishedAt), "\n")
publishedAt column type for US data: Date 
# Check date range for various country data

cat("Date range for US data:", as.character(range(us_df$publishedAt, na.rm = TRUE)), "\n")
Date range for US data: 2015-04-01 2021-11-29 
cat("Date range for Argentina data:", as.character(range(argentina_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Argentina data: 2016-10-02 2021-11-29 
cat("Date range for Canada data:", as.character(range(canada_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Canada data: 2013-12-26 2021-11-29 
cat("Date range for China data:", as.character(range(china_df$publishedAt, na.rm = TRUE)), "\n")
Date range for China data: 2019-11-23 2021-11-29 
cat("Date range for India data:", as.character(range(india_df$publishedAt, na.rm = TRUE)), "\n")
Date range for India data: 2015-04-25 2021-11-29 
cat("Date range for Italy data:", as.character(range(italy_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Italy data: 2016-07-10 2021-11-29 
cat("Date range for Russia data:", as.character(range(russia_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Russia data: 2010-11-24 2021-11-29 

All of the countries have an end date of 2021-11-29, after which the data was no longer collected. The earliest published-article date ranges from 2010-11-24 for Russia to 2019-11-23 for China.

# Combine the country datasets into df_countries

# Set Arrow option to handle null characters
# NOTE(review): this option was already set above at load time; the repeat is
# harmless but redundant.
options(arrow.skip_nul = TRUE)

# Stack all seven country frames row-wise; they share the same columns, so
# bind_rows() aligns them by name. The nul-stripping warnings below come from
# embedded '\0' characters in some text columns.
df_countries <- bind_rows(
  us_df, 
  canada_df, 
  china_df, 
  india_df, 
  italy_df, 
  russia_df, 
  argentina_df
)
Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector

Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector

Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector
# Check the combined dataset
cat("Combined dataset dimensions:", nrow(df_countries), "rows,", ncol(df_countries), "columns\n")
Combined dataset dimensions: 796684 rows, 16 columns
cat("Countries included:", unique(df_countries$location), "\n")
Countries included: United States Canada China India Italy Russian Federation Argentina 
# Filter the dataset to start from 2020 for more focused analysis
# year > 2019 keeps 2020 and 2021 articles (collection ends 2021-11-29).
df_countries_2020 <- df_countries %>% filter(year > 2019)

Analysis of sentiment and bias in USA news

Sentiment Analysis

# Average sentiment by category

# Transform df_country to only include USA, then date, categories as columns, and average sentiment as values
usa_df <- df_countries %>%
  filter(location == "United States") %>%
  # Collapse each article date to the first day of its month
  mutate(year_month = floor_date(publishedAt, "month")) %>%
  group_by(category, year_month) %>%
  summarise(
    avg_compound = mean(compound, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  # Drop the article counts; keep one average-sentiment value per category-month
  select(category, year_month, avg_compound) %>%
  # Wide format: one column per category, one row per month (NA = no articles)
  pivot_wider(
    names_from = category, 
    values_from = avg_compound
  ) %>%
  arrange(year_month) %>%
  rename(date = year_month)

head(usa_df)
# A tibble: 6 × 8
  date       business entertainment general health science sports technology
  <date>        <dbl>         <dbl>   <dbl>  <dbl>   <dbl>  <dbl>      <dbl>
1 2015-04-01       NA            NA      NA  0.637  NA         NA         NA
2 2015-05-01       NA            NA      NA NA       0         NA         NA
3 2015-10-01       NA            NA      NA NA       0         NA         NA
4 2015-11-01       NA            NA      NA NA      -0.459     NA         NA
5 2016-05-01       NA            NA      NA NA       0         NA         NA
6 2016-06-01       NA            NA      NA  0      NA         NA         NA
# Time series plot for average sentiment by category

# One line per category column of the wide usa_df frame. `linewidth` replaces
# the `size` aesthetic for lines, deprecated since ggplot2 3.4 (this document
# loads 3.5.2). The x limits restrict the view to 2020-2021 even though the
# underlying data starts in 2015.
usa_category_plot <- ggplot(usa_df, aes(x = date)) + 
  geom_line(aes(y = general, color = "General"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = business, color = "Business"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = health, color = "Health"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = technology, color = "Technology"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = entertainment, color = "Entertainment"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = sports, color = "Sports"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = science, color = "Science"), linewidth = 1, na.rm = TRUE) +
  labs(
    title = "Average Sentiment by Category for USA News",
    subtitle = "From 2020 - 2021",
    x = "Date",
    y = "Average sentiment score (-1 to 1)",
    color = "Category"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_x_date(
    limits = c(as.Date("2020-01-01"), as.Date("2021-12-31")),
    date_breaks = "6 months", 
    date_labels = "%Y-%m"
  )

# Render interactively with a shared x hover line
ggplotly(usa_category_plot) %>%
  layout(hovermode = "x")
# Compare average sentiment across categories (boxplot)

# Use the 2020+ subset so the plot matches its "From 2020 - 2021" subtitle
# (and the summary table below, which also uses df_countries_2020).
df_countries_2020 %>%
  filter(location == "United States") %>%
  ggplot(aes(x = category, y = compound, fill = category)) +
  geom_boxplot() +
  theme_minimal() +
  # theme() must come AFTER theme_minimal(): a complete theme replaces all
  # elements, so the rotated x labels were previously being discarded.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Sentiment Distribution by Category for USA News",
       subtitle = "From 2020 - 2021",
       x = "Category",
       y = "Compound Sentiment Score") +
  guides(fill = "none")

In general, science and sports have a higher moving average of sentiment across time due to the objective nature of science articles and the exciting nature of sports headlines, while general and health had the lowest sentiment due to the COVID-19 pandemic.

# Summary Sentiment statistics by category for USA

# Per-category descriptive statistics of the compound sentiment score,
# restricted to 2020+ articles, ordered most-positive first.
usa_category_summary <- df_countries_2020 %>%
  filter(location == "United States") %>%
  group_by(category) %>%
  summarise(
    mean_sentiment = mean(compound, na.rm = TRUE),
    median_sentiment = median(compound, na.rm = TRUE),
    sd_sentiment = sd(compound, na.rm = TRUE),
    min_sentiment = min(compound, na.rm = TRUE),
    max_sentiment = max(compound, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  arrange(desc(mean_sentiment))

print(usa_category_summary)
# A tibble: 7 × 7
  category      mean_sentiment median_sentiment sd_sentiment min_sentiment
  <chr>                  <dbl>            <dbl>        <dbl>         <dbl>
1 technology           0.0541                 0        0.343        -0.931
2 sports               0.0537                 0        0.361        -0.97 
3 science              0.0394                 0        0.293        -0.926
4 entertainment        0.0255                 0        0.407        -0.945
5 business             0.00648                0        0.340        -0.919
6 health              -0.0485                 0        0.335        -0.940
7 general             -0.0883                 0        0.381        -0.952
# ℹ 2 more variables: max_sentiment <dbl>, n_articles <int>

Bias Analysis

# Average bias by category

# Transform df_country to only include USA, then date, categories as columns, and average bias as values
# Same shape as the sentiment pipeline above, but aggregating bias_score.
usa_bias_df <- df_countries %>%
  filter(location == "United States") %>%
  mutate(year_month = floor_date(publishedAt, "month")) %>%
  group_by(category, year_month) %>%
  summarise(
    avg_bias = mean(bias_score, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  select(category, year_month, avg_bias) %>%
  # Wide format: one column per category, one row per month (NA = no articles)
  pivot_wider(
    names_from = category, 
    values_from = avg_bias
  ) %>%
  arrange(year_month) %>%
  rename(date = year_month)

head(usa_bias_df)
# A tibble: 6 × 8
  date       business entertainment general health science sports technology
  <date>        <dbl>         <dbl>   <dbl>  <dbl>   <dbl>  <dbl>      <dbl>
1 2015-04-01       NA            NA      NA  0.973  NA         NA         NA
2 2015-05-01       NA            NA      NA NA      -0.522     NA         NA
3 2015-10-01       NA            NA      NA NA       0.555     NA         NA
4 2015-11-01       NA            NA      NA NA      -0.659     NA         NA
5 2016-05-01       NA            NA      NA NA      -0.894     NA         NA
6 2016-06-01       NA            NA      NA  0.615  NA         NA         NA
# Time series plot for average bias by category

# One line per category column of the wide usa_bias_df frame. `linewidth`
# replaces the deprecated `size` aesthetic (ggplot2 >= 3.4). The y-axis label
# is corrected: bias scores span roughly -1 to 1 (the summary table below
# shows minima near -0.98), not 0 to 1.
usa_bias_plot <- ggplot(usa_bias_df, aes(x = date)) + 
  geom_line(aes(y = general, color = "General"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = business, color = "Business"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = health, color = "Health"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = technology, color = "Technology"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = entertainment, color = "Entertainment"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = sports, color = "Sports"), linewidth = 1, na.rm = TRUE) +
  geom_line(aes(y = science, color = "Science"), linewidth = 1, na.rm = TRUE) +
  labs(
    title = "Average Bias by Category for USA News",
    subtitle = "From 2020 - 2021",
    x = "Date",
    y = "Average bias score (-1 to 1)",
    color = "Category"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_x_date(
    limits = c(as.Date("2020-01-01"), as.Date("2021-12-31")),
    date_breaks = "6 months", 
    date_labels = "%Y-%m"
  )

# Render interactively with a shared x hover line
ggplotly(usa_bias_plot) %>%
  layout(hovermode = "x")
# Compare average bias across categories (boxplot)

# Use the 2020+ subset so the plot matches its "From 2020 - 2021" subtitle.
df_countries_2020 %>%
  filter(location == "United States") %>%
  ggplot(aes(x = category, y = bias_score, fill = category)) +
  geom_boxplot() +
  theme_minimal() +
  # theme() must come AFTER theme_minimal(): a complete theme replaces all
  # elements, so the rotated x labels were previously being discarded.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Bias Distribution by Category for USA News",
       subtitle = "From 2020 - 2021",
       x = "Category",
       y = "Bias Score") +
  guides(fill = "none")

# Summary Bias statistics by category for USA bias

# Per-category descriptive statistics of bias_score, ordered most-biased first.
# NOTE(review): this uses the full df_countries (2010-2021), unlike the
# sentiment summary above which uses df_countries_2020 — confirm whether the
# full range is intended here.
usa_bias_summary <- df_countries %>%
  filter(location == "United States") %>%
  group_by(category) %>%
  summarise(
    mean_bias = mean(bias_score, na.rm = TRUE),
    median_bias = median(bias_score, na.rm = TRUE),
    sd_bias = sd(bias_score, na.rm = TRUE),
    min_bias = min(bias_score, na.rm = TRUE),
    max_bias = max(bias_score, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  arrange(desc(mean_bias))

print(usa_bias_summary)
# A tibble: 7 × 7
  category      mean_bias median_bias sd_bias min_bias max_bias n_articles
  <chr>             <dbl>       <dbl>   <dbl>    <dbl>    <dbl>      <int>
1 science         0.292         0.606   0.702   -0.974    0.995       2895
2 entertainment   0.261         0.600   0.720   -0.973    0.995      19840
3 technology      0.177         0.558   0.741   -0.953    0.995       5823
4 general         0.0585        0.503   0.737   -0.979    0.995       6194
5 health          0.00663      -0.521   0.737   -0.986    0.995      17408
6 business       -0.00272      -0.526   0.737   -0.980    0.995      19089
7 sports         -0.0556       -0.542   0.714   -0.978    0.995      21248

Analysis Health, General, Science categories combined

In order to capture the sentiment around COVID-19 and also leverage the scarce related data in the dataset, we have combined the Health, Science, and General categories to explore the sentiment and bias during the COVID-19 pandemic (2020-02 to 2021-11)

# Combined time series plot for sentiment and bias across health, general, and science categories, daily

# Prepare daily combined data for USA with both sentiment and bias for health,
# general, and science categories. ("business" was previously included in the
# filter, contradicting this comment, the plot title, and the narrative above;
# it has been removed to match the stated three-category scope.)
usa_combined_daily <- df_countries %>%
  filter(location == "United States", category %in% c("health", "general", "science")) %>%
  group_by(publishedAt) %>%
  summarise(
    avg_sentiment = mean(compound, na.rm = TRUE),
    avg_bias = mean(bias_score, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  rename(date = publishedAt) %>%
  # Transform to long format for plotting: one row per date x metric
  pivot_longer(
    cols = c(avg_sentiment, avg_bias),
    names_to = "metric",
    values_to = "score"
  ) %>%
  # Human-readable metric labels for the legend / facet strips
  mutate(
    metric = case_when(
      metric == "avg_sentiment" ~ "Sentiment",
      metric == "avg_bias" ~ "Bias"
    )
  )

head(usa_combined_daily)
# A tibble: 6 × 4
  date       n_articles metric     score
  <date>          <int> <chr>      <dbl>
1 2015-04-01          1 Sentiment  0.637
2 2015-04-01          1 Bias       0.973
3 2015-05-30          1 Sentiment  0    
4 2015-05-30          1 Bias      -0.522
5 2015-10-23          1 Sentiment  0    
6 2015-10-23          1 Bias       0.555
# Create combined faceted plot (one panel per metric, free y scales)

# Create the base plot with sentiment and bias for combined health, general,
# and science categories. `linewidth` replaces the deprecated `size` aesthetic
# (ggplot2 >= 3.4); the stray empty first argument in scale_x_date() is removed.
usa_combined_plot <- ggplot(usa_combined_daily, aes(x = date, y = score, color = metric)) +
  geom_line(linewidth = 0.5, alpha = 0.7) +
  facet_wrap(~ metric, scales = "free_y", ncol = 1) +
  labs(
    title = "Daily Sentiment and Bias Trends for USA Health, General & Science News",
    subtitle = "From 2020 - 2021 (Combined Categories Analysis)",
    x = "Date",
    y = "Score",
    color = "Metric"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold")
  ) +
  scale_x_date(
    date_breaks = "3 months", 
    date_labels = "%Y-%m"
  ) +
  scale_color_manual(values = c("Sentiment" = "#2E86AB", "Bias" = "#A23B72"))

# Render interactively with a shared x hover line
ggplotly(usa_combined_plot) %>%
  layout(hovermode = "x")

Before 2020, the time series shows no clear pattern: bias and sentiment values are sparse and appear randomly distributed

After the first confirmed U.S. COVID-19 case, news sentiment started neutral as initial reporting was factual. A sharp negative drop in May 2020 coincided with rapidly rising cases and crises in hospitals. Optimism returned briefly in July 2020 due to early vaccine trial progress and partial reopening measures. Sentiment then stabilized around neutral with occasional negative dips, including during the December 2020 holiday surge. In 2021, coverage remained neutral to slightly negative, but July 2021 saw the lowest sentiment of the year, corresponding to the Delta variant surge and heightened public health concerns

Sentiment during Christmas and New Year 2020–2021 was particularly low due to a combination of factors: a surge in COVID-19 cases following holiday gatherings, public health warnings discouraging travel and celebrations, widespread pandemic fatigue, and uncertainty surrounding the early vaccine rollout. Media coverage during this period was dominated by negative or cautionary language, leading to a pronounced dip in sentiment despite the holiday season.

# Decomposition of combined sentiment time series in the USA

# Prepare monthly aggregated sentiment data for USA (health, general, science combined)
usa_combined_monthly <- df_countries %>%
  filter(location == "United States", category %in% c("health", "general", "science")) %>%
  mutate(year_month = floor_date(publishedAt, "month")) %>%
  group_by(year_month) %>%
  summarise(avg_sentiment = mean(compound, na.rm = TRUE), .groups = 'drop') %>%
  # Fill in calendar months with no articles so the ts index is truly monthly,
  # then interpolate the gaps so decompose() sees a complete series. Without
  # this, gapped months were silently treated as consecutive.
  complete(year_month = seq(min(year_month), max(year_month), by = "month")) %>%
  arrange(year_month) %>%
  mutate(avg_sentiment = zoo::na.approx(avg_sentiment, na.rm = FALSE))

# Create time series object with monthly frequency. The start is derived from
# the data (first observed month is 2015-04 per the date-range check above);
# the previous hard-coded start = c(2018, 1) mislabeled the time axis.
start_month <- min(usa_combined_monthly$year_month)
combined_ts <- ts(
  usa_combined_monthly$avg_sentiment,
  start = c(year(start_month), month(start_month)),
  frequency = 12
)

# Decompose (additive by default)
dec <- decompose(combined_ts)
# Time index for plotting
t <- time(combined_ts)

# Make four simple plots: observed / trend / seasonal / remainder
p1 <- plot_ly(x = ~t, y = ~as.numeric(dec$x),        type = "scatter", mode = "lines", name = "Observed",  showlegend = FALSE)
p2 <- plot_ly(x = ~t, y = ~as.numeric(dec$trend),    type = "scatter", mode = "lines", name = "Trend",     showlegend = FALSE)
p3 <- plot_ly(x = ~t, y = ~as.numeric(dec$seasonal), type = "scatter", mode = "lines", name = "Seasonal",  showlegend = FALSE)
p4 <- plot_ly(x = ~t, y = ~as.numeric(dec$random),   type = "scatter", mode = "lines", name = "Remainder", showlegend = FALSE)

subplot(p1, p2, p3, p4, nrows = 4, shareX = TRUE) |>
  layout(
    title = "Decomposition of USA Combined Sentiment (Health, General, Science)",
    xaxis  = list(title = "Year"),
    yaxis  = list(title = "Observed", nticks = 5, tickfont = list(size = 10)),
    yaxis2 = list(title = "Trend",    nticks = 5, tickfont = list(size = 10)),
    yaxis3 = list(title = "Seasonal", nticks = 3, tickformat = ".3f", tickfont = list(size = 10)),
    yaxis4 = list(title = "Remainder",nticks = 3, tickformat = ".3f", tickfont = list(size = 10)),
    margin = list(l = 70, r = 10, t = 40, b = 40)
  )

The decomposition shows a gradual decline in overall news sentiment since 2015, reflecting increasingly neutral or negative coverage over time. Superimposed on this trend is a clear seasonal pattern, with predictable dips early in the year, spikes around mid-year events, smaller mid-year peaks, and another year-end spike likely tied to holidays or major announcements. These seasonal oscillations highlight the recurring rhythm of news sentiment in response to cyclical events.

# Decomposition of combined bias time series in the USA

# Prepare monthly aggregated bias data for USA (health, general, science combined)
usa_combined_bias_monthly <- df_countries %>%
  filter(location == "United States", category %in% c("health", "general", "science")) %>%
  mutate(year_month = floor_date(publishedAt, "month")) %>%
  group_by(year_month) %>%
  summarise(avg_bias = mean(bias_score, na.rm = TRUE), .groups = 'drop') %>%
  # Fill in months with no articles and interpolate, so decompose() sees a
  # gap-free monthly series (gapped months were previously treated as consecutive).
  complete(year_month = seq(min(year_month), max(year_month), by = "month")) %>%
  arrange(year_month) %>%
  mutate(avg_bias = zoo::na.approx(avg_bias, na.rm = FALSE))

# Create time series object with monthly frequency. Start is derived from the
# data; the previous hard-coded start = c(2018, 1) mislabeled the time axis
# (first observed month is 2015-04 per the date-range check above).
start_month <- min(usa_combined_bias_monthly$year_month)
combined_bias_ts <- ts(
  usa_combined_bias_monthly$avg_bias,
  start = c(year(start_month), month(start_month)),
  frequency = 12
)

# Decompose (additive by default)
dec <- decompose(combined_bias_ts)
# Time index for plotting
t <- time(combined_bias_ts)

# Make four simple plots: observed / trend / seasonal / remainder
p1 <- plot_ly(x = ~t, y = ~as.numeric(dec$x),        type = "scatter", mode = "lines", name = "Observed",  showlegend = FALSE)
p2 <- plot_ly(x = ~t, y = ~as.numeric(dec$trend),    type = "scatter", mode = "lines", name = "Trend",     showlegend = FALSE)
p3 <- plot_ly(x = ~t, y = ~as.numeric(dec$seasonal), type = "scatter", mode = "lines", name = "Seasonal",  showlegend = FALSE)
p4 <- plot_ly(x = ~t, y = ~as.numeric(dec$random),   type = "scatter", mode = "lines", name = "Remainder", showlegend = FALSE)

subplot(p1, p2, p3, p4, nrows = 4, shareX = TRUE) |>
  layout(
    title = "Decomposition of USA Combined Bias (Health, General, Science)",
    xaxis  = list(title = "Year"),
    yaxis  = list(title = "Observed", nticks = 5, tickfont = list(size = 10)),
    yaxis2 = list(title = "Trend",    nticks = 5, tickfont = list(size = 10)),
    yaxis3 = list(title = "Seasonal", nticks = 3, tickformat = ".3f", tickfont = list(size = 10)),
    yaxis4 = list(title = "Remainder",nticks = 3, tickformat = ".3f", tickfont = list(size = 10)),
    margin = list(l = 70, r = 10, t = 40, b = 40)
  )
# Create US data frame with publishedAt, category, year, month, avg_compound, and avg_bias

# Create aggregated US data frame by date and category
# One row per (date, category) with daily average sentiment, bias, and count.
us_daily_combinedC_df <- df_countries %>%
  filter(location == "United States", category %in% c("health", "general", "science")) %>%
  group_by(publishedAt, category) %>%
  summarise(
    avg_compound = mean(compound, na.rm = TRUE),
    avg_bias = mean(bias_score, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  # Derive calendar components from the date for later modeling
  mutate(
    year = year(publishedAt),
    month = month(publishedAt)
  ) %>%
  select(publishedAt, category, year, month, avg_compound, avg_bias, n_articles) %>%
  arrange(publishedAt, category)

# Display first few rows and summary
head(us_daily_combinedC_df, 10)
# A tibble: 10 × 7
   publishedAt category  year month avg_compound avg_bias n_articles
   <date>      <chr>    <dbl> <dbl>        <dbl>    <dbl>      <int>
 1 2015-04-01  health    2015     4       0.637   0.973            1
 2 2015-05-30  science   2015     5       0      -0.522            1
 3 2015-10-23  science   2015    10       0       0.555            1
 4 2015-11-11  science   2015    11      -0.459  -0.659            1
 5 2016-05-29  science   2016     5       0      -0.894            2
 6 2016-06-28  health    2016     6       0       0.615            1
 7 2017-06-06  science   2017     6       0.0505  0.392           17
 8 2017-06-08  science   2017     6       0.148   0.0313           2
 9 2017-06-09  science   2017     6       0.0502 -0.00601          2
10 2017-08-08  health    2017     8       0       0.796            1
# Extract day from date e.g., 1 for Sunday, 2 for Monday, etc.

# Add day of week information to the US daily dataframe
us_daily_combinedC_df <- us_daily_combinedC_df %>%
  mutate(
    # wday() returns 1 for Sunday, 2 for Monday, ..., 7 for Saturday
    # NOTE(review): assumes the default week_start (Sunday); the
    # lubridate.week.start option would change the numbering — confirm unset.
    day_of_week = wday(publishedAt),
    # wday() with label=TRUE, abbr=FALSE returns full day names  
    day_full_name = wday(publishedAt, label = TRUE, abbr = FALSE)
  )

# Display updated dataframe with day information
head(us_daily_combinedC_df, 10)
# A tibble: 10 × 9
   publishedAt category  year month avg_compound avg_bias n_articles day_of_week
   <date>      <chr>    <dbl> <dbl>        <dbl>    <dbl>      <int>       <dbl>
 1 2015-04-01  health    2015     4       0.637   0.973            1           4
 2 2015-05-30  science   2015     5       0      -0.522            1           7
 3 2015-10-23  science   2015    10       0       0.555            1           6
 4 2015-11-11  science   2015    11      -0.459  -0.659            1           4
 5 2016-05-29  science   2016     5       0      -0.894            2           1
 6 2016-06-28  health    2016     6       0       0.615            1           3
 7 2017-06-06  science   2017     6       0.0505  0.392           17           3
 8 2017-06-08  science   2017     6       0.148   0.0313           2           5
 9 2017-06-09  science   2017     6       0.0502 -0.00601          2           6
10 2017-08-08  health    2017     8       0       0.796            1           3
# ℹ 1 more variable: day_full_name <ord>
# Run a regression sentiment score on bias score, category, day_of_week

# OLS of daily average sentiment on average bias, day-of-week dummies, and
# category dummies. The data argument is named explicitly for clarity;
# "general" is the baseline category (first alphabetically).
us_sentiment_combinedC_model <- lm(
  avg_compound ~ avg_bias + as.factor(day_of_week) + as.factor(category),
  data = us_daily_combinedC_df
)

summary(us_sentiment_combinedC_model)

Call:
lm(formula = avg_compound ~ avg_bias + as.factor(day_of_week) + 
    as.factor(category), data = us_daily_combinedC_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.55521 -0.06109 -0.00585  0.05835  0.66994 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                -0.086675   0.010590  -8.185 6.21e-16 ***
avg_bias                    0.020295   0.012107   1.676   0.0939 .  
as.factor(day_of_week)2     0.021759   0.013180   1.651   0.0990 .  
as.factor(day_of_week)3     0.010480   0.013157   0.797   0.4258    
as.factor(day_of_week)4     0.010694   0.013239   0.808   0.4194    
as.factor(day_of_week)5    -0.002696   0.013074  -0.206   0.8367    
as.factor(day_of_week)6     0.009995   0.013025   0.767   0.4430    
as.factor(day_of_week)7    -0.003128   0.013220  -0.237   0.8130    
as.factor(category)health   0.034281   0.008553   4.008 6.45e-05 ***
as.factor(category)science  0.119142   0.008989  13.254  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1294 on 1360 degrees of freedom
Multiple R-squared:  0.1439,    Adjusted R-squared:  0.1383 
F-statistic:  25.4 on 9 and 1360 DF,  p-value: < 2.2e-16

The linear regression shows that headline category is the strongest predictor of daily average sentiment: science news is associated with the highest positive shift, followed by health news. Bias has a small positive effect on sentiment, though only marginally significant. Day-of-week effects are negligible. Overall, the model explains about 14% of the variation in daily sentiment, indicating that much of sentiment variation is driven by factors not captured in this model

# Convert residuals to time series component and look at the residuals ACF

# Extract residuals from the regression model
residuals_data <- residuals(us_sentiment_combinedC_model)

# Create a dataframe with residuals and dates for proper ordering
# (residuals() preserves the row order of the model data, which is already
# sorted by publishedAt, category)
residuals_df <- data.frame(
  residuals = residuals_data,
  publishedAt = us_daily_combinedC_df$publishedAt,
  category = us_daily_combinedC_df$category
) %>%
  arrange(publishedAt)

# Convert residuals to time series object
# Note: Since we have irregular time series (not every day has data for every category),
# we'll work with the residuals as a vector for ACF analysis
res_fit <- residuals_df$residuals

# Create ACF plot for residuals (ggAcf from the forecast package)
acf_plot <- ggAcf(res_fit, lag.max = 40) +
  labs(
    title = "Autocorrelation Function (ACF) of Regression Residuals",
    subtitle = "US Daily News Sentiment Model Residuals",
    x = "Lag",
    y = "ACF"
  ) +
  theme_minimal()

print(acf_plot)

The ACF plot of regression residuals shows low autocorrelation, with only three minor spikes outside the confidence bounds: at lag 2, and around lags 30–35. This indicates that the model largely captures the systematic variation in sentiment, and the residuals behave approximately as white noise. The spikes may reflect small short-term or monthly cycles in news sentiment not fully accounted for by the model

# Lag plot for sentiment in USA

# Create a time series of daily average sentiment for USA
# (averaging across the three categories for each date)
usa_sentiment_combinedC_daily <- us_daily_combinedC_df %>%
  group_by(publishedAt) %>%
  summarise(
    daily_avg_sentiment = mean(avg_compound, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  arrange(publishedAt) %>%
  filter(!is.na(daily_avg_sentiment))

# Convert to time series object
# NOTE(review): frequency = 365 treats observations as consecutive days, but
# the series has date gaps — lag-k here means "k observations back", not
# "k calendar days back"; confirm this is the intended interpretation.
usa_sentiment_combinedC.ts <- ts(usa_sentiment_combinedC_daily$daily_avg_sentiment, frequency = 365)

# Create lag plots with 4 different lags (forecast::gglagplot)
gglagplot(usa_sentiment_combinedC.ts, do.lines = FALSE, lags = 4) +
  labs(
    title = "Lag Plots: USA Daily Sentiment Analysis",
    subtitle = "Autocorrelation patterns in daily sentiment data",
    x = "Y(t-k)",
    y = "Y(t)"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

The lag plot of daily sentiment shows points scattered around zero, forming a roughly circular pattern. This indicates that consecutive daily sentiment values are largely independent, with no strong linear autocorrelation. The circular scatter may hint at mild non-linear dependencies or recurring cycles, but overall, sentiment appears mostly driven by individual events rather than day-to-day persistence.

# Lag plot for bias in USA

# Create a time series of daily average bias for USA
# (averaging across the three categories for each date)
usa_bias_combinedC_daily <- us_daily_combinedC_df %>%
  group_by(publishedAt) %>%
  summarise(
    daily_avg_bias = mean(avg_bias, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  arrange(publishedAt) %>%
  filter(!is.na(daily_avg_bias))

# Convert to time series object
# NOTE(review): as with sentiment, frequency = 365 ignores date gaps, so
# lag-k means "k observations back" rather than "k calendar days back".
usa_bias_combinedC.ts <- ts(usa_bias_combinedC_daily$daily_avg_bias, frequency = 365)

# Create lag plots with 4 different lags (forecast::gglagplot)
gglagplot(usa_bias_combinedC.ts, do.lines = FALSE, lags = 4) +
  labs(
    title = "Lag Plots: USA Daily Bias Analysis",
    subtitle = "Autocorrelation patterns in daily bias data",
    x = "Y(t-k)",
    y = "Y(t)"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

The lag plot of daily bias shows points scattered around zero, forming a roughly circular pattern. This indicates that consecutive daily bias values are largely independent, with no strong linear autocorrelation. The circular scatter may hint at mild non-linear dependencies or recurring cycles, but overall, bias appears mostly driven by individual events rather than day-to-day persistence.

# ACF plot for USA daily sentiment
# Sample autocorrelations of the daily sentiment vector up to 40 lags
# (forecast::ggAcf); spikes outside the bands suggest temporal dependence.
ggAcf(usa_sentiment_combinedC_daily$daily_avg_sentiment, lag.max = 40) +
  labs(
    title = "Autocorrelation Function (ACF): USA Daily Sentiment",
    subtitle = "Identifying temporal dependencies in daily sentiment data",
    x = "Lag (days)",
    y = "ACF"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

The ACF of the daily sentiment series (lags 1–40) shows that most autocorrelations fall within the confidence bounds, indicating weak temporal dependence. Only five lags show significant autocorrelation, suggesting minor short-term or periodic effects in news sentiment. Overall, sentiment appears largely event-driven rather than strongly influenced by preceding days.

While the sentiment series exhibits weak autocorrelation, the presence of seasonal cycles and minor short-term dependencies justifies exploring time series models, particularly ones that can incorporate exogenous event-related variables. Pure autoregressive models may have limited predictive power due to the event-driven nature of sentiment, but a seasonal or dynamic regression framework could improve forecasting and provide insight into temporal patterns.

# Partial autocorrelations of USA daily sentiment out to a 40-day lag
usa_sentiment_pacf_plot <- ggPacf(
  usa_sentiment_combinedC_daily$daily_avg_sentiment,
  lag.max = 40
) +
  labs(
    x = "Lag (days)",
    y = "PACF",
    title = "Partial Autocorrelation Function (PACF): USA Daily Sentiment",
    subtitle = "Identifying direct temporal relationships in daily sentiment data"
  ) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  )

usa_sentiment_pacf_plot

The PACF of daily sentiment shows only four lags with significant partial autocorrelation, indicating that short-term dependencies exist for a few immediate days, but beyond that, sentiment behaves largely independently. This suggests that if a time series model is applied, only a low-order autoregressive component is needed, and incorporating seasonality or event-based regressors would likely provide more explanatory power.

Analysis of top sources for health and science

# View main sources of news for science and health category in the USA

# Per-source article volume, mean sentiment, and mean bias for USA
# health and science coverage, most prolific sources first.
usa_sources_health_science <- df_countries %>%
  filter(location == "United States") %>%
  filter(category %in% c("health", "science")) %>%
  group_by(category, `source-name`) %>%
  summarise(
    n_articles = n(),
    avg_sentiment = mean(compound, na.rm = TRUE),
    avg_bias = mean(bias_score, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(category, desc(n_articles))

# Keep the ten most prolific sources within each category
# (usa_sources_health_science is already sorted by article count).
top_sources_health <- usa_sources_health_science %>%
  filter(category == "health") %>%
  head(n = 10)

top_sources_science <- usa_sources_health_science %>%
  filter(category == "science") %>%
  head(n = 10)

# Display the results
# NOTE(review): the unprefixed lines below are rendered console output from
# the knitted document and are kept verbatim.
cat("Top 10 Health News Sources in USA (2020-2021):\n")
Top 10 Health News Sources in USA (2020-2021):
print(top_sources_health)
# A tibble: 10 × 5
   category `source-name`       n_articles avg_sentiment avg_bias
   <chr>    <chr>                    <int>         <dbl>    <dbl>
 1 health   Google News                632       -0.0422   0.110 
 2 health   Yahoo Entertainment        630       -0.0427   0.135 
 3 health   CNN                        426       -0.103    0.182 
 4 health   New York Times             357       -0.0601   0.586 
 5 health   Fox News                   344       -0.0605  -0.0684
 6 health   Eatthis.com                189       -0.0302  -0.0889
 7 health   USA Today                  188       -0.0689   0.0879
 8 health   New York Post              170       -0.167    0.235 
 9 health   Lost Coast Outpost         168       -0.316   -0.706 
10 health   NPR                        163       -0.0767   0.0140
# Same table for the science category
cat("\nTop 10 Science News Sources in USA (2020-2021):\n")

Top 10 Science News Sources in USA (2020-2021):
print(top_sources_science)
# A tibble: 10 × 5
   category `source-name`       n_articles avg_sentiment avg_bias
   <chr>    <chr>                    <int>         <dbl>    <dbl>
 1 science  Phys.Org                   193        0.0702    0.519
 2 science  CNN                        160        0.0135    0.296
 3 science  Yahoo Entertainment        124        0.0944    0.314
 4 science  Space.com                  119        0.0488    0.329
 5 science  SciTechDaily               112        0.115     0.428
 6 science  New York Times              93       -0.0236    0.816
 7 science  NASA                        81        0.0905    0.254
 8 science  Live Science                80       -0.0307    0.147
 9 science  Fox News                    79        0.0239    0.214
10 science  Business Insider            72       -0.0254    0.109
# Create visualization for top news sources by article count

# Health News Sources Visualization
# USA health articles only, from the 2020-2021 subset
df_usa_health <- df_countries_2020 %>%
  filter(location == "United States", category == "health")

# Article counts per outlet, keeping the 20 largest
top_health_sources <- df_usa_health %>%
  count(`source-name`, sort = TRUE) %>%
  slice_head(n = 20)

# Horizontal bars; reorder() puts the busiest outlets at the top
health_sources_plot <- plot_ly(
  data = top_health_sources,
  type = "bar",
  orientation = "h",
  x = ~n,
  y = ~reorder(`source-name`, n),
  name = "Health Sources",
  marker = list(color = "#E74C3C")
) %>%
  layout(
    margin = list(l = 150),  # wide left margin for long outlet names
    xaxis = list(title = "Article Count"),
    yaxis = list(title = "News Source"),
    title = list(
      text = "Top 20 USA Health News Sources by Article Count (2020-2021)",
      font = list(family = "Times New Roman")
    )
  )

health_sources_plot
# Science News Sources Visualization
# USA science articles only, from the 2020-2021 subset
df_usa_science <- df_countries_2020 %>%
  filter(location == "United States", category == "science")

# Article counts per outlet, keeping the 20 largest
top_science_sources <- df_usa_science %>%
  count(`source-name`, sort = TRUE) %>%
  slice_head(n = 20)

# Horizontal bars; reorder() puts the busiest outlets at the top
science_sources_plot <- plot_ly(
  data = top_science_sources,
  type = "bar",
  orientation = "h",
  x = ~n,
  y = ~reorder(`source-name`, n),
  name = "Science Sources",
  marker = list(color = "#3498DB")
) %>%
  layout(
    margin = list(l = 150),  # wide left margin for long outlet names
    xaxis = list(title = "Article Count"),
    yaxis = list(title = "News Source"),
    title = list(
      text = "Top 20 USA Science News Sources by Article Count (2020-2021)",
      font = list(family = "Times New Roman")
    )
  )

science_sources_plot
# Average Sentiment for top 5 health sources

# Five most-covered health outlets (top_sources_health is already
# sorted by article count)
top_5_health_sentiment <- head(top_sources_health, n = 5)

# Vertical bars ordered by mean sentiment; bar labels carry the
# article count and the rounded sentiment score
health_sentiment_plot <- plot_ly(
  data = top_5_health_sentiment,
  type = "bar",
  x = ~reorder(`source-name`, avg_sentiment),
  y = ~avg_sentiment,
  name = "Health Sentiment",
  marker = list(color = "#27AE60"),
  text = ~paste("Articles:", n_articles, "<br>Sentiment:", round(avg_sentiment, 3)),
  textposition = "auto"
) %>%
  layout(
    margin = list(b = 100),  # extra bottom margin for source labels
    xaxis = list(title = "News Source"),
    yaxis = list(title = "Average Sentiment Score"),
    title = list(
      text = "Average Sentiment for Top 5 USA Health News Sources (2020-2021)",
      font = list(family = "Times New Roman")
    )
  )

health_sentiment_plot
# Average Bias for top 5 health sources

# Five most-covered health outlets (same selection as the sentiment chart)
top_5_health_bias <- head(top_sources_health, n = 5)

# Vertical bars ordered by mean bias; bar labels carry the
# article count and the rounded bias score
health_bias_plot <- plot_ly(
  data = top_5_health_bias,
  type = "bar",
  x = ~reorder(`source-name`, avg_bias),
  y = ~avg_bias,
  name = "Health Bias",
  marker = list(color = "#E67E22"),
  text = ~paste("Articles:", n_articles, "<br>Bias:", round(avg_bias, 3)),
  textposition = "auto"
) %>%
  layout(
    margin = list(b = 100),  # extra bottom margin for source labels
    xaxis = list(title = "News Source"),
    yaxis = list(title = "Average Bias Score"),
    title = list(
      text = "Average Bias for Top 5 USA Health News Sources (2020-2021)",
      font = list(family = "Times New Roman")
    )
  )

health_bias_plot
# Average Sentiment for top 5 science sources

# Five most-covered science outlets (top_sources_science is already
# sorted by article count)
top_5_science_sentiment <- head(top_sources_science, n = 5)

# Vertical bars ordered by mean sentiment; bar labels carry the
# article count and the rounded sentiment score
science_sentiment_plot <- plot_ly(
  data = top_5_science_sentiment,
  type = "bar",
  x = ~reorder(`source-name`, avg_sentiment),
  y = ~avg_sentiment,
  name = "Science Sentiment",
  marker = list(color = "#8E44AD"),
  text = ~paste("Articles:", n_articles, "<br>Sentiment:", round(avg_sentiment, 3)),
  textposition = "auto"
) %>%
  layout(
    margin = list(b = 100),  # extra bottom margin for source labels
    xaxis = list(title = "News Source"),
    yaxis = list(title = "Average Sentiment Score"),
    title = list(
      text = "Average Sentiment for Top 5 USA Science News Sources (2020-2021)",
      font = list(family = "Times New Roman")
    )
  )

science_sentiment_plot
# Average Bias for top 5 science sources

# Five most-covered science outlets (same selection as the sentiment chart)
top_5_science_bias <- head(top_sources_science, n = 5)

# Vertical bars ordered by mean bias; bar labels carry the
# article count and the rounded bias score
science_bias_plot <- plot_ly(
  data = top_5_science_bias,
  type = "bar",
  x = ~reorder(`source-name`, avg_bias),
  y = ~avg_bias,
  name = "Science Bias",
  marker = list(color = "#E74C3C"),
  text = ~paste("Articles:", n_articles, "<br>Bias:", round(avg_bias, 3)),
  textposition = "auto"
) %>%
  layout(
    margin = list(b = 100),  # extra bottom margin for source labels
    xaxis = list(title = "News Source"),
    yaxis = list(title = "Average Bias Score"),
    title = list(
      text = "Average Bias for Top 5 USA Science News Sources (2020-2021)",
      font = list(family = "Times New Roman")
    )
  )

science_bias_plot